# load libraries
library(tidyverse)
library(sf)
library(lubridate)
library(riem)
library(imputeTS)
library(gridExtra)
library(FNN)
library(kableExtra)
library(gganimate)
library(gifski)
library(caret)

# set D.C. coordinate reference system (NAD83/Maryland, EPSG:2248 —
# a planar CRS, so later nearest-neighbor distances are in its linear units)
dc_crs <- "EPSG:2248"

# load city boundary for cartographic context
# NOTE(review): reads from a remote ArcGIS endpoint at run time — assumes the
# URL stays available
dc <- 
  st_read("https://opendata.arcgis.com/datasets/7241f6d500b44288ad983f0942b39663_10.geojson") %>%
  st_transform(dc_crs)

# load "neighborhood clusters" for cartographic context (used as the base
# layer in every map below)
dc_nhoods <-
  st_read("https://opendata.arcgis.com/datasets/f6c703ebe2534fc3800609a07bad8f5b_17.geojson") %>%
  st_transform(dc_crs)

# street centerlines — unused, kept commented for reference
# dc_streets <-
#   st_read("https://opendata.arcgis.com/datasets/6b05d209ce1f45f1b6c537fac8cec386_48.geojson") %>%
#   st_union() %>%
#   st_transform(dc_crs)

# load book functions (provides nn_function(), used for the amenity features)
source("https://raw.githubusercontent.com/urbanSpatial/Public-Policy-Analytics-Landing/master/functions.r")

# set color palettes (sequential, light to dark) used across charts and maps
palette5 <- c("#eff3ff","#bdd7e7","#6baed6","#3182bd","#08519c")
palette4 <- c("#d2fbd4","#92bcab","#527d82","#123f5A")
palette2 <- c("#6baed6","#08519c")

# block scientific notation in printed output
options(scipen = 999)

# set map styling options
# ggplot theme for maps: strips axis text/ticks and panel background,
# italicizes the subtitle, and left-aligns the caption
mapTheme <- function() {
  blank <- element_blank()
  theme(
    axis.text = blank,
    axis.ticks = blank,
    panel.background = blank,
    plot.subtitle = element_text(face = "italic"),
    plot.caption = element_text(hjust = 0)
  )
}

# set plot styling options
# ggplot theme for charts: faint gray grid lines, no axis ticks, no legend
# title, italic subtitle, left-aligned caption
plotTheme <- function() {
  # NOTE(review): element_line(size =) is deprecated in ggplot2 >= 3.4 in
  # favor of linewidth; kept as-is here for behavior parity
  faint_grid <- element_line(color = "gray75", size = 0.1)
  theme(
    axis.ticks = element_blank(),
    legend.title = element_blank(),
    panel.background = element_blank(),
    panel.border = element_blank(),
    panel.grid.major = faint_grid,
    panel.grid.minor = faint_grid,
    plot.subtitle = element_text(face = "italic"),
    plot.caption = element_text(hjust = 0)
  )
}

Introduction

In not much more than a decade, the idea of unlocking a bike at a street corner, riding it across town, and then dropping it at your destination has gone from a notion that might raise eyebrows to an integral part of major cities’ transit systems. Unlike other transit, bikeshare follows no set routes and has no hours of operation – which makes it convenient to use, but complex to operate. How does a bikeshare system operator ensure that riders don’t arrive at a docking station only to find no bikes, or no space to park the one they’re on?

One answer to this might be to attempt to predict demand for bikes or docks based on historic ride data and other relevant features, and to then “rebalance” the bikes across time and space in response to that demand. In this analysis, I explored techniques for forecasting demand across time and space as applied to Washington, D.C.’s bikeshare system, Capital Bikeshare (“CaBi”), which has been in place since September 2010. As of December 2021, CaBi had more than 600 stations across the District of Columbia, Maryland, and Virginia. To tighten the scope of the analysis (and reduce processing time), I chose to focus on the 335 stations located in the District proper. I also limited the scope to the five weeks between Monday, October 4, 2021 and Sunday, November 7, 2021.

The timeframe for the predictions generated in this exercise is quite short: I did not create a model to predict bikeshare demand weeks, months, or years from now. Instead, the premise here is to provide actionable information to system operators who expect to be rebalancing bikes at various times throughout the day, but who could use actionable information about what docks to prioritize.

Though CaBi offers both docked “classic” bikes and dockless electric bikes, I omitted the dockless bikes from this analysis given the very different rebalancing dynamics at hand.

# load bikeshare stations in D.C. proper
# keeps only name + geometry; geometry survives dplyr::select because sf
# geometry columns are sticky
stations <- 
  st_read("https://opendata.arcgis.com/datasets/a1f7acf65795451d89f0a38565a975b3_5.geojson") %>%
  filter(REGION_ID == 42) %>% # District of Columbia
  dplyr::select(station_name = NAME, geometry) %>%
  arrange(station_name) %>%
  st_transform(dc_crs)

# load and wrangle bikeshare trip data
# NOTE(review): ymd_hms() parses to UTC by default — confirm the raw CSV
# timestamps are local (Eastern) time before interpreting hours of day
trips_2021_10 <- read.csv("202110-capitalbikeshare-tripdata.csv") %>%
  mutate(started_at = ymd_hms(started_at)) %>%
  filter(started_at >= as.Date("2021-10-03")) # 2021-10-04 start + 1 day lag

trips_2021_11 <- read.csv("202111-capitalbikeshare-tripdata.csv") %>%
  mutate(started_at = ymd_hms(started_at)) %>%
  filter(started_at < as.Date("2021-11-08"))

# combine the two months, keep only docked starts at known D.C. stations,
# and derive hour / quarter-hour / ISO week / weekday features per trip
trips <- rbind(trips_2021_10, trips_2021_11) %>%
  filter(!is.na(start_station_id)) %>% # remove trips with dockless start
  dplyr::select(station_name = start_station_name, started_at) %>%
  arrange(station_name, started_at) %>%
  mutate(
    # presumably a station rename in the source data — TODO confirm
    station_name = if_else(station_name == "11th & H St NE", 
      "10th & H St NE", as.character(station_name)),
    hour = floor_date(started_at, unit = "hour"),
    quarter_hour = floor_date(started_at, unit = "15 min"),
    week = isoweek(started_at),
    day = wday(started_at, label = TRUE)
  ) %>%
  filter(station_name %in% stations$station_name)

Feature engineering

In addition to data on trip origin stations and start times, I incorporated information about weather along with a variety of time-based features. I explored other features based on the proximity of stations to amenities, but did not use them for reasons described later on.

Weather

Of all the factors likely to influence someone’s decision to get on a bike, weather seems like a top candidate. In addition to temperature, precipitation, and wind speed, I included relative humidity, the bane of any D.C. resident trying to get outside (if not quite as much during the study period as at other times of the year).

I didn’t love – or fully understand – the way the book dealt with aggregate and missing data for the weather time series*, so I calculated hourly means for my weather features and used the imputeTS package to interpolate missing values instead. I wasn’t sure how well this would work when I realized that of the 816 hours in my panel, the 19 with no temperature or precipitation data were consecutive, but linear interpolation seemed to work well enough (for a homework assignment, anyway) to impute temperature and relative humidity. For precipitation, I took the 100% non-scalable approach of confirming there was no precipitation during the relevant window and hard-coding it to 0.

*Replacing NAs with 0s, taking the max value per hour, and then replacing any remaining 0s with 42, for some reason (maybe some sort of seasonal mean for Chicago?).

# load weather data
# METAR observations from Washington National Airport (DCA) via the riem
# package; column renames follow riem's variable codes (tmpf = temp in F,
# relh = relative humidity, sknt = wind in knots, p01i = 1-hour precip)
weather_data <- 
  riem_measures(
    station = "DCA", date_start = "2021-10-04", date_end = "2021-11-08"
  ) %>%
  dplyr::select(
    time = valid,
    temperature = tmpf,
    dewpoint = dwpf,
    humidity = relh,
    windspeed = sknt,
    precip_1h = p01i,
    visibility = vsby,
    gust = gust,
    sky_1 = skyc1,
    peak_wind_gust,
    feel
  ) %>%
  mutate(
    # truncate observation times to the hour for joining to the trip panel
    hour = floor_date(time, unit = "hour"),
    # binary indicators for clear / overcast sky condition codes
    clear = if_else(sky_1 == "CLR", 1, 0),
    overcast = if_else(sky_1 == "OVC", 1, 0)
  )

# create hourly weather panel and impute missing values
# riem returns multiple observations per hour, so collapse to hourly means,
# then fill gaps: linear interpolation (imputeTS) for temperature and
# humidity, and a constant 0 for precipitation (the narrative confirms the
# missing window was dry).
weather_panel <-
  weather_data %>%
  group_by(hour) %>%
  summarize(
    temperature = mean(temperature, na.rm = TRUE),
    precip_1h = mean(precip_1h, na.rm = TRUE),
    windspeed = mean(windspeed, na.rm = TRUE),
    humidity = mean(humidity, na.rm = TRUE),
    # replaces a trailing comma (which only parsed thanks to rlang's
    # tolerance) and silences the grouping message
    .groups = "drop"
  ) %>%
  mutate(
    temperature_imputed = na_interpolation(temperature),
    precip_1h_imputed = na_replace(precip_1h, fill = 0),
    humidity_imputed = na_interpolation(humidity)
    # NOTE(review): windspeed is never imputed; hours with no windspeed
    # observations stay NA/NaN and are later dropped by lm()'s default
    # na.action — confirm this is intended
  )

# visualize the raw temperature series, highlighting missing stretches
temp_distrib_chart <-
  ggplot_na_distribution(
    weather_panel$temperature,
    subtitle = "Temperature",
    ylab = "Degrees F",
    xlab = "Hour"
  ) + 
  plotTheme()

# overlay interpolated temperature values on the observed series
temp_impute_chart <-
  ggplot_na_imputations(
    weather_panel$temperature,
    weather_panel$temperature_imputed,
    subtitle = "Temperature",
    ylab = "Degrees F",
    xlab = "Hour",
    legend = FALSE
  ) + 
  plotTheme()

# visualize the raw precipitation series, highlighting missing stretches
precip_distrib_chart <- 
  ggplot_na_distribution(
    weather_panel$precip_1h, 
    subtitle = "Precipitation in last hour",
    ylab = "Inches",
    xlab = "Hour"
  ) + 
  plotTheme()

# overlay the hard-coded-0 precipitation imputation on the observed series
precip_impute_chart <- ggplot_na_imputations(
    weather_panel$precip_1h,
    weather_panel$precip_1h_imputed,
    subtitle = "Precipitation in last hour",
    ylab = "Inches",
    xlab = "Hour",
    legend = FALSE
  ) + 
  plotTheme()

# visualize the raw humidity series, highlighting missing stretches
humid_distrib_chart <-
  ggplot_na_distribution(
    weather_panel$humidity, 
    subtitle = "Relative humidity",
    ylab = "Percent",
    xlab = "Hour"
  ) + 
  plotTheme()

# overlay interpolated humidity values on the observed series
humid_impute_chart <- ggplot_na_imputations(
    weather_panel$humidity,
    weather_panel$humidity_imputed,
    subtitle = "Relative humidity",
    ylab = "Percent",
    xlab = "Hour",
    legend = FALSE
  ) + 
  plotTheme()

# show the three imputation overlays stacked in one column
# NOTE(review): the *_distrib_chart objects are built but never arranged —
# confirm whether they were meant to be displayed too
grid.arrange(
  temp_impute_chart, 
  precip_impute_chart,
  humid_impute_chart,
  ncol = 1)

# drop the raw (un-imputed) weather columns so the final panel keeps only
# the imputed versions plus windspeed
weather_panel <- weather_panel %>%
  dplyr::select(-all_of(c("temperature", "precip_1h", "humidity")))

As the charts below show, precipitation was nonexistent or low during most of the study period, with just a handful of major rain events in late October. Temperatures trended downward, as might be expected at this time of year; so did humidity to some extent, but much less dramatically. Winds were generally higher in the second half of October but followed no clear trends.

# grid.arrange(
#   ncol= 1,
#   top = "Weather data - Washington National Airport (DCA) - October & November, 2021",
#   ggplot(weather_panel, aes(hour, precip_1h_imputed)) + geom_line() + 
#     labs(title = "Precipitation", x = "Hour", y = "Inches/last hour"),
#   ggplot(weather_panel, aes(hour, temperature_imputed)) + geom_line() +
#     labs(title = "Temperature", x = "Hour", y = "Degrees F"),
#   ggplot(weather_panel, aes(hour, windspeed)) + geom_line() +
#     labs(title = "Wind speed", x = "Hour", y = "Knots"),
#   ggplot(weather_panel, aes(hour, humidity_imputed)) + geom_line() +
#     labs(title = "Humidity", x = "Hour", y = "Percent")
# )

# stack the four hourly weather series (using imputed values where available)
grid.arrange(
  ncol= 1,
  top = "Weather data - Washington National Airport (DCA) - October & November, 2021",
  ggplot(weather_panel, aes(hour, precip_1h_imputed)) + geom_line() + 
    labs(title = "Precipitation in last hour", x = "Hour", y = "Inches") + plotTheme(),
  ggplot(weather_panel, aes(hour, temperature_imputed)) + geom_line() +
    labs(title = "Temperature", x = "Hour", y = "Degrees F") + plotTheme(),
  ggplot(weather_panel, aes(hour, windspeed)) + geom_line() +
    labs(title = "Wind speed", x = "Hour", y = "Knots") + plotTheme(),
  ggplot(weather_panel, aes(hour, humidity_imputed)) + geom_line() +
    labs(title = "Humidity", x = "Hour", y = "Percent") + plotTheme()
)

Time

The trip data presented opportunities for a number of time-based features beyond the start time of each individual trip. First, I extracted features for week of the year and day of the week, to capture information that might vary cyclically; next, I created time lag features representing the number of trips originating from each station at earlier points in time.

To ensure that all trips in the study period had a full set of lagged features, I imported trip data starting one day earlier, which I then dropped from the final trip time series, or panel.

I included visualizations related to these time features under “Exploratory analysis” below.

# create empty panel of all station-hour combinations, so that station-hours
# with zero trips still exist as rows
empty_panel <- 
  expand_grid(
    hour = unique(trips$hour),
    station_name = unique(stations$station_name)
  ) %>%
  arrange(station_name, hour)

# compute trip counts by station-hour; only combinations with at least one
# trip appear here — zeros are filled in after joining to the empty panel
trip_summary <-
  trips %>%
  group_by(hour, station_name) %>%
  summarize(trips = n())

# create trip panel with weather and time lag features
# one row per station-hour; hours without trips are filled with 0, and lags
# are computed within station so values never leak across stations
trip_panel <- 
  # join keys made explicit (previously implicit, emitting a join message)
  left_join(empty_panel, trip_summary, by = c("hour", "station_name")) %>%
  replace_na(list(trips = 0)) %>%
  left_join(weather_panel, by = "hour") %>%
  mutate(
    week = isoweek(hour),
    day = wday(hour, label = TRUE)
  ) %>%
  arrange(station_name, hour) %>%
  group_by(station_name) %>%
  mutate(
    lag_1h = dplyr::lag(trips, 1),
    lag_2h = dplyr::lag(trips, 2),
    lag_3h = dplyr::lag(trips, 3),
    lag_4h = dplyr::lag(trips, 4),
    lag_12h = dplyr::lag(trips, 12),
    lag_24h = dplyr::lag(trips, 24)
  ) %>%
  ungroup() %>%
  # drop the lead-in day that existed only to populate the lag features
  # NOTE(review): POSIXct-vs-Date comparison depends on the time zone the
  # timestamps were parsed in (ymd_hms defaults to UTC) — confirm
  filter(hour >= as.Date("2021-10-04"))

Amenities

# load metro station entrances (for the distance-to-Metro feature)
metro_entrances <- 
  st_read("https://opendata.arcgis.com/datasets/ab5661e1a4d74a338ee51cd9533ac787_50.geojson") %>%
  st_transform(dc_crs)

# load liquor licenses and keep only "Restaurant"-type licenses, used as a
# proxy for restaurant locations
restaurants <-
  st_read("https://opendata.arcgis.com/datasets/cabe9dcef0b344518c7fae1a3def7de1_5.geojson") %>%
  filter(TYPE == "Restaurant") %>%
  st_transform(dc_crs)

# load museums (for the museum-proximity feature)
museums <- 
  st_read("https://opendata.arcgis.com/datasets/2e65fc16edc3481989d2cc17e6f8c533_54.geojson") %>%
  st_transform(dc_crs)

# create amenity features at the bikeshare station level
# nn_function() (from the sourced book functions) gives each station's mean
# distance to its k nearest features, in planar CRS units; geometry is then
# dropped so this joins cleanly to non-spatial trip data
station_amenities <-
  stations %>%
  mutate(
    metro_distance = nn_function(st_coordinates(.), st_coordinates(metro_entrances), 1),
    museum_nn3 = nn_function(st_coordinates(.), st_coordinates(museums), 3),
    # trailing comma removed — it only parsed thanks to rlang's tolerance
    restaurants_nn5 = nn_function(st_coordinates(.), st_coordinates(restaurants), 5)
  ) %>%
  st_drop_geometry()
  
# select example stations to reduce dataset size and processing time
example_stations <- c(
  "Lamont & Mt Pleasant NW",
  "Neal St & Trinidad Ave NE",
  "Union Market"
)

# join station amenities to trip data (joins implicitly on station_name)
# to demonstrate that amenity values are constant within a station
trip_amenities_sample <- 
  left_join(
    filter(trips, station_name %in% example_stations), 
    station_amenities
  ) %>%
  dplyr::select(
    started_at, station_name, metro_distance, museum_nn3, restaurants_nn5
  )

The book chapter established that spatial exposure features tend to be redundant in this type of spatiotemporal model, because the spatial variation they attempt to capture is already controlled for by the variable used as the spatial unit of analysis. To see whether this held true here, I engineered three spatial features that seemed plausibly associated with higher-than-average bikeshare traffic:

  • Distance to nearest Metro station entrance (source): Because we like multimodal transportation here.
  • Mean distance to three nearest museums (source): Bikeshare is a convenient way to get around for tourists, probably.
  • Mean distance to five nearest restaurants (source): D.C. has some great ones, so why not bike there? “Restaurant”-type liquor licenses used as proxy for restaurants.

The table below, a sample from the full results, shows what we might have expected: that the spatial amenity features are perfectly collinear with the station name variable, and can therefore safely be excluded from the model.

# create table of sample results: five random trips per example station
# NOTE(review): slice_sample() is unseeded, so the displayed rows change on
# every knit/run
trip_amenities_table <- 
  trip_amenities_sample %>%
  group_by(station_name) %>%
  slice_sample(n = 5)

# render the sample as a styled kableExtra table
kbl(trip_amenities_table, 
    caption = "Spatial amenity features at the trip and station level") %>%
  kable_classic("striped", full_width = FALSE) %>%
  footnote(general = "Excerpted from full dataset")
Spatial amenity features at the trip and station level
started_at station_name metro_distance museum_nn3 restaurants_nn5
2021-10-23 07:00:10 Lamont & Mt Pleasant NW 2031.269 2550.705 382.6872
2021-10-13 12:54:50 Lamont & Mt Pleasant NW 2031.269 2550.705 382.6872
2021-10-18 07:16:58 Lamont & Mt Pleasant NW 2031.269 2550.705 382.6872
2021-11-07 13:10:13 Lamont & Mt Pleasant NW 2031.269 2550.705 382.6872
2021-10-26 08:16:14 Lamont & Mt Pleasant NW 2031.269 2550.705 382.6872
2021-11-01 06:45:58 Neal St & Trinidad Ave NE 4781.631 4826.410 1210.6935
2021-10-21 10:14:43 Neal St & Trinidad Ave NE 4781.631 4826.410 1210.6935
2021-10-21 10:14:40 Neal St & Trinidad Ave NE 4781.631 4826.410 1210.6935
2021-10-24 11:35:07 Neal St & Trinidad Ave NE 4781.631 4826.410 1210.6935
2021-11-03 07:43:57 Neal St & Trinidad Ave NE 4781.631 4826.410 1210.6935
2021-11-07 12:39:24 Union Market 1823.240 3593.770 362.7717
2021-10-04 08:32:59 Union Market 1823.240 3593.770 362.7717
2021-10-17 13:36:31 Union Market 1823.240 3593.770 362.7717
2021-10-23 10:55:18 Union Market 1823.240 3593.770 362.7717
2021-10-25 07:59:07 Union Market 1823.240 3593.770 362.7717
Note:
Excerpted from full dataset

Exploratory analysis

After creating the panel, I split the data into a training set consisting of trips taken in the three weeks from October 4 to October 24, and a test set consisting of trips taken in the two weeks from October 25 to November 7. Before creating and testing the model, however, I conducted exploratory analysis on the trip data along with the new features.

# split the panel into training and test sets by date

# training set: Mon 10/4 through Sun 10/24 (three weeks)
training <- trip_panel %>%
  filter(hour < as.Date("2021-10-25"))

# test set: Mon 10/25 through Sun 11/7 (two weeks)
test <- trip_panel %>%
  filter(hour >= as.Date("2021-10-25"))

Trips across time

As the chart below shows, bikeshare trips follow a clear daily cycle, peaking during the day and plummeting overnight. Most weekdays also show two distinct peaks for the morning and evening rush. Halloween, which fell on a Sunday this year, was not clearly associated with any unusual trip volumes or patterns.

# reference points for chart annotations: the 1 a.m. hour of each Monday
# (rewritten from an ifelse() that mixed logical and numeric branches into a
# direct, equivalent filter)
# NOTE(review): hour(hour) == 1 marks 1 a.m., not midnight — confirm intended
mondays <-
  trip_panel %>%
  filter(day == "Mon", hour(hour) == 1) %>%
  dplyr::select(hour) %>%
  distinct()

# Halloween reference timestamp
# NOTE(review): as.POSIXct() ignores the trailing "UTC-4" text and uses the
# session time zone — confirm this lines up with the trip timestamps
halloween <- as.POSIXct("2021-10-31 01:00:00 UTC-4")

# plot hourly citywide trip totals, colored by training vs. test period,
# with reference lines at each Monday (1 a.m.) and at Halloween
rbind(
  mutate(training, Legend = "Training"),
  mutate(test, Legend = "Test")
) %>%
  group_by(Legend, hour) %>%
  summarize(trips = sum(trips)) %>%
  ungroup() %>%
  ggplot(aes(hour, trips, color = Legend)) +
    geom_line() +
    scale_color_manual(values = palette2) +
    geom_vline(xintercept = halloween, linetype = "dotted") +
    geom_vline(data = mondays, aes(xintercept = hour), size = 0.25) +
    labs(
      title = "Trips by week: October-November",
      subtitle = "Solid lines for Mondays; dotted line for Halloween",
      x = "Date",
      y = "Trips"
    ) +
    plotTheme() +
    theme(
      #panel.grid.major = element_blank(), 
      legend.position = "bottom")

The charts below clearly illustrate the distinct weekday and weekend trends by time of day.

# hourly trip frequency polygons, one line per day of week
freqpoly_chart_1 <-
  ggplot(trips %>% mutate(h = hour(started_at))) +
    geom_freqpoly(aes(h, color = day), binwidth = 1) +
    labs(
      title = "Trips by time and day of week",
      subtitle = "October-November 2021",
      x = "Hour",
      y = "Trips"
    ) +
    plotTheme() +
    theme(legend.position = "bottom")

# same distribution collapsed to a weekend-vs-weekday comparison
freqpoly_chart_2 <-
  ggplot(trips %>% mutate(
    h = hour(started_at),
    weekend = ifelse(day %in% c("Sat", "Sun"), "Weekend", "Weekday")
  )) +
    geom_freqpoly(aes(h, color = weekend), binwidth = 1) +
    labs(
      title = "", # Trips by time, weekend and weekday
      subtitle = "",
      x = "",
      y = "Trips"
    ) +
    plotTheme() +
    theme(legend.position = "bottom")

# show the two charts side by side
grid.arrange(freqpoly_chart_1, freqpoly_chart_2, ncol = 2)

Trips and lagged trips

The correlations between trips and lagged trips are weak at best: if not for the helpful blue lines, I would be hard pressed to find a linear relationship in any of these plots. Perhaps unsurprisingly, the strongest of the weak relationships are found between current trips and lagged trips a) an hour ago, b) the same hour the previous day. Also unsurprisingly, trips 12 hours ago say basically nothing about trips at the present time.

The book found much more convincing relationships in its analysis of downtown Chicago rideshare data, and I’m wondering to what extent this is a way that rideshare differs from bikeshare; Chicago differs from D.C.; the larger unit of analysis (Census tract vs. bikeshare station) really helps here; or any number of other potential explanations.

# reshape the first week of the panel to long form: one row per
# (lag feature, value) pair, with facets ordered chronologically
# NOTE(review): gather() is superseded; pivot_longer() is the modern
# equivalent
lag_plot_data <-
  filter(as.data.frame(trip_panel), week == min(trip_panel$week)) %>%
  dplyr::select(starts_with("lag"), trips) %>%
  gather(variable, value, -trips) %>%
  mutate(
    variable = fct_relevel(
      variable, "lag_1h", "lag_2h", "lag_3h", "lag_4h", "lag_12h", "lag_24h"
    )
  )

# Pearson correlation between trips and each lag, for the facet labels
lag_correlations <-
  group_by(lag_plot_data, variable) %>%
  summarize(
    correlation = round(
      cor(value, trips, use = "complete.obs"), 2
    )
  )

# scatterplots of trips vs. each lag feature with per-facet correlation labels
# fixes: paste() with a trailing-space literal plus the default sep produced
# a double space ("r =  0.5"); correlation was rounded a second redundant
# time; plotTheme() added for consistency with the file's other charts
ggplot(lag_plot_data, aes(value, trips)) +
  geom_point(size = 0.5) +
  geom_smooth(method = "lm", se = FALSE) +
  geom_label(data = lag_correlations,
            aes(label = paste0("r = ", correlation)),
            x = Inf, y = Inf, vjust = 1.5, hjust = 1.1, 
            label.size = 0) +
  facet_wrap(~variable, ncol = 6) +
  labs(
    title = "Capital Bikeshare trips as a function of lagged trips",
    subtitle = "First week of October, 2021",
    x = "Lagged trips",
    y = "Trips"
  ) +
  plotTheme()

Trips across space

Mapping trip totals by station shows a clear concentration of trip origins in and around the downtown CBD, along with several stations with top trip counts near the National Mall, museums, and waterfront.

# total trip count per station over the whole study period
# NOTE(review): drop_na() removes stations with no joined trips, so
# zero-trip stations do not appear on the map — confirm intended
trip_map_data <-
  left_join(stations, trips) %>%
  drop_na() %>%
  group_by(station_name) %>%
  tally() %>%
  arrange(n) %>% # low counts drawn first so high-count points sit on top
  st_sf()

# map station totals over the neighborhood-cluster base layer
ggplot() +
  geom_sf(data = dc_nhoods, fill = "gray98", color = "gray85", size = 0.2) +
  geom_sf(data = trip_map_data, aes(color = n), size = 3) +
  scale_color_viridis_c("Trips", direction = -1, option = "E") +
  labs(title = "Trip origins by station, Oct-Nov 2021",
       subtitle = "District of Columbia") +
  mapTheme()

Trips across time and space

The maps below add a temporal dimension to the spatial data, disaggregating station trip counts into weekend and weekday counts by time of day. Weekday morning and evening trips are consistent with commuting patterns into and out of downtown; overnight trips concentrate in areas dense with restaurants and nightlife. On the weekend, the highest trip counts are found near museums and tourist attractions, though the “weekend mid-day” shadows also suggest brunch; somehow, no one bikes anywhere before 10 am on a weekend morning.

# disaggregate station trip counts by weekend/weekday and time of day
# case_when: first matching condition wins; Overnight = before 7 am or after
# 6 pm, AM Rush = 7-9, Mid-Day = 10-14, PM Rush = 15-18
spatiotemporal_map_data <-
  left_join(stations, trips) %>%
  drop_na() %>%
  mutate(
    hour = hour(started_at),
    weekend = ifelse(day %in% c("Sun", "Sat"), "Weekend", "Weekday"),
    time_of_day = factor(case_when(
      hour(started_at) < 7 | hour(started_at) > 18 ~ "Overnight",
      hour(started_at) >= 7 & hour(started_at) < 10 ~ "AM Rush",
      hour(started_at) >= 10 & hour(started_at) < 15 ~ "Mid-Day",
      hour(started_at) >= 15 & hour(started_at) < 19 ~ "PM Rush"
    ), levels = c("AM Rush", "Mid-Day", "PM Rush", "Overnight"))
  ) %>%
  group_by(station_name, weekend, time_of_day) %>%
  tally() %>%
  arrange(n) %>%
  st_sf()

# small-multiple maps: one facet per weekend/time-of-day combination
ggplot() +
  geom_sf(data = dc_nhoods, fill = "gray98", color = "gray85", size = 0.2) +
  geom_sf(data = spatiotemporal_map_data, aes(color = n), size = 3) +
  scale_color_viridis_c(direction = -1, option = "E") +
  facet_grid(weekend ~ time_of_day) +
  labs(title = "Trips by time and station, Oct-Nov 2021") +
  mapTheme()

A single day’s trips, animated

Finally, the animation below visualizes trip starts over space and time in 15-minute intervals for Saturday, October 9, 2021 – the date with the highest number of total trips during the period studied. Relatively few stations originated more than one trip in any given 15-minute period, but the format allows us to visualize gradual spatial trends in movement patterns throughout the day.

# total trips per calendar day (used to identify the busiest day to animate)
trips_by_date <-
  trip_panel %>%
  group_by(date = date(hour)) %>%
  summarize(trips = sum(trips))

# trips for the single animated day: Saturday 10/9, the busiest day in the
# study period
animation_week_trips <- 
  filter(trips, date(hour) == as.Date("2021-10-09")) %>%
  dplyr::select(station_name, quarter_hour)


# every observed 15-minute interval crossed with every station, so stations
# with no trips in an interval still get a frame row
animation_week_panel <- 
  expand_grid(
    quarter_hour = unique(animation_week_trips$quarter_hour),
    station_name = unique(stations$station_name)
  )

# assemble animation frame data: trips per station per 15-minute interval
# NOTE(review): because left_join() yields at least one row per
# station-interval, n() can never be 0 — the "None" bin is unreachable and
# zero-trip intervals count as 1; confirm whether a sum of matches was
# intended instead
animation_data <-
  left_join(animation_week_panel, animation_week_trips) %>%
  group_by(quarter_hour, station_name) %>%
  summarize(trips = n()) %>%
  ungroup() %>%
  left_join(stations) %>%
  mutate(
    trips_factor = factor(
      case_when(
        trips == 0 ~ "None",
        trips <= 3 ~ "1 to 3",
        trips <= 6 ~ "4 to 6",
        trips <= 9 ~ "7 to 9",
        trips >= 10 ~ "10+"
      ), 
      # fixed: levels previously listed "7 to 10" (a label case_when never
      # produces) and omitted "10+", so those bins silently became NA
      levels = c("None", "1 to 3", "4 to 6", "7 to 9", "10+")
    )
  ) %>%
  arrange(trips) %>%
  st_sf()

# build and render the animation: one frame per 15-minute interval
animation <-
  ggplot() +
  geom_sf(data = dc_nhoods, fill = "gray98", color = "gray85", size = 0.2) +
  geom_sf(data = animation_data, aes(color = trips), size = 3) +
  scale_color_viridis_c("Trips", direction = -1, option = "E") +
  labs(
    title = "Trip origins by station for one day",
    subtitle = "15-minute intervals: {current_frame}" # gganimate frame label
  ) +
  transition_manual(quarter_hour) +
  mapTheme()
animate(animation, duration = 20, renderer = gifski_renderer())


Model testing and validation

To assess the efficacy of various model features, I tested and compared four different linear regressions. The first included hour, day of week, and weather alone; the second included hour, weather, and station alone; the third included both time and station features along with weather; and the fourth added the lagged trip features.

# --- define regressions to test ---
# four linear models of increasing richness fit on the training set;
# hour-of-day enters as a factor, weekday via the `day` factor column,
# stations as fixed effects via station_name

# model A: time and weather alone
reg1 <- lm(trips ~ as.factor(hour(hour)) + day + temperature_imputed + 
             precip_1h_imputed + windspeed + humidity_imputed, data = training)

# model B: space (station fixed effects) and weather alone
reg2 <- lm(trips ~ temperature_imputed + precip_1h_imputed + windspeed + 
             humidity_imputed + station_name, data = training)

# model C: space, time, and weather
reg3 <- lm(trips ~ as.factor(hour(hour)) + day + temperature_imputed + 
             precip_1h_imputed + windspeed + humidity_imputed + 
             station_name, data = training)

# model D: space, time, weather, and lagged trips
reg4 <- lm(trips ~ as.factor(hour(hour)) + day + 
             temperature_imputed + precip_1h_imputed + windspeed + 
             humidity_imputed + lag_1h + lag_2h + lag_3h + lag_4h + 
             lag_12h + lag_24h + station_name, data = training)


# --- validate on test set ---

# join station geometries to the test set (explicit key silences the
# implicit-join message; result is unchanged)
test <- left_join(test, stations, by = "station_name")

# nest the test set by week: one row per week with a "data" list-column
# (modernized from the superseded nest(-week) syntax; output is identical)
test_week_nest <- 
  test %>%
  nest(data = -week)

# generate predictions from a fitted model for new data
# (the original assigned to an unused local and relied on R's implicit —
# and invisible — return of the assignment; return predictions directly)
model_pred <- function(data, fit) {
  predict(fit, newdata = data)
}

# generate predictions for each test week under each model
# map() iterates over the weekly data list-column; `fit` is forwarded to
# model_pred through map()'s ... arguments
week_predictions <-
  test_week_nest %>%
  mutate(
    A_Time_FE = map(.x = data, fit = reg1, .f = model_pred),
    B_Space_FE = map(.x = data, fit = reg2, .f = model_pred),
    C_SpaceTime_FE = map(.x = data, fit = reg3, .f = model_pred),
    D_SpaceTime_Lag = map(.x = data, fit = reg4, .f = model_pred)
  )

# compute error metrics per model-week: MAE plus the SD of absolute errors
# (gather stacks the four prediction list-columns into long form)
week_predictions <-
  week_predictions %>%
  gather(Regression, Prediction, -data, -week) %>%
  mutate(
    Observed = map(data, pull, trips),
    Absolute_Error = map2(Observed, Prediction, ~ abs(.x - .y)),
    MAE = map_dbl(Absolute_Error, mean),
    sd_AE = map_dbl(Absolute_Error, sd)
  )

Mean errors by model and week

As the chart below shows, the models incorporating spatial information slightly outperformed the one that did not, and the model that also incorporated time-lagged trip data was the best-performing of the four. However, the differences were less dramatic than we might have hoped.

The MAE of just under 0.9 for the best-performing model seems impressive until you consider that the mean number of trips across all observations in the panel was only 0.997. Unfortunately, because many records have zero observed trips, computing mean absolute percent error (MAPE) would have required dividing by 0, so I did not calculate it; plotting MAE against number of trips or cross-validating high- and low-ridership stations could be a worthwhile avenue for further exploration.

Also, a shortcoming of using linear regression to generate predictions is that the model predicted a negative number of trips for quite a few stations. I considered using some conditional logic to set all negative predictions to 0, but figuring out how to do anything in nested tibbles made me want to stab my eyes out, so I didn’t.

# bar chart of MAE by model and test week
# NOTE(review): gather() on the single MAE column just reshapes it in place;
# it could likely be dropped without changing the chart
week_predictions %>%
  dplyr::select(week, Regression, MAE) %>%
  gather(variable, MAE, -Regression, -week) %>%
  ggplot(aes(week, MAE)) +
    geom_bar(aes(fill = Regression), stat = "identity", position = "dodge") +
    scale_fill_manual(values = palette5) +
    scale_x_continuous(breaks = c(43, 44)) + # ISO week numbers of test weeks
    labs(
      title = "Mean absolute errors",
      subtitle = "by model and week",
      x = "Week"
    ) +
    plotTheme() +
    theme(legend.position = "bottom")

Errors across time by model

The next chart shows prediction errors for each model with more granularity across time, both indicating where each performed well and pointing to some areas for improvement. Interestingly, although it still performed the worst by far, even the model that incorporated no explicit time-derived features still captured the existence of some cyclical pattern, even if it fell far short of the magnitude – possibly because some of the weather features, like temperature and humidity, indirectly contributed some information about time of day.

The Friday and Saturday before Halloween seem to have been associated with some of the largest prediction errors, which is probably not surprising. Lessons there might be that a) incorporating holidays into this type of model could be sensible; b) hard-coding the holiday date is probably not the way to do this, at least for Halloween.

# build hourly means of observed and predicted trips for each model
# NOTE(review): the trailing comma in mutate() only parses thanks to rlang's
# tolerance; unnest() without cols is deprecated in newer tidyr
pred_time_plot_data <- 
  week_predictions %>%
  mutate(
    hour = map(data, pull, hour),
    station_name = map(data, pull, station_name),
  ) %>%
  dplyr::select(hour, station_name, Observed, Prediction, Regression) %>%
  unnest() %>%
  mutate(
    # negative linear-model predictions clipped at zero — computed but
    # filtered out of the chart below
    prediction_0floor = if_else(Prediction < 0, 0, Prediction)
  ) %>%
  rename(Predicted = Prediction) %>%
  gather(variable, value, -Regression, -hour, -station_name) %>%
  group_by(Regression, variable, hour) %>%
  summarize(value = mean(value))

# plot mean predicted vs. observed trips per hour, one facet per model
filter(pred_time_plot_data, variable != "prediction_0floor") %>%
  ggplot(aes(hour, value, color = variable)) +
  geom_line() +
  geom_vline(xintercept = halloween, linetype = "dotted") +
  geom_vline(data = mondays, aes(xintercept = hour), size = 0.25) +
  facet_wrap(~Regression, ncol = 1) +
  scale_color_manual(values = palette2) +
  labs(title = "Predicted and observed trips by hour",
       subtitle = "Solid lines for Mondays; dotted line for Halloween",
       # fixed: axis labels were swapped — x is the date axis, y is trips
       x = "Date",
       y = "Trips") +
  plotTheme() +
  theme(legend.position = "bottom")

Mean errors by station

Finally, the map below shows mean absolute error by station for each of the two weeks in the test set. This is more than anything a map of high-ridership stations, and mapping MAPE by station would probably have been more useful here. There does not appear to be any discernible difference in the spatial distribution of prediction errors between the two weeks, at least at first glance.

# mean absolute error per station per test week, for the best model (D)
errors_by_week <- 
  filter(week_predictions, Regression == "D_SpaceTime_Lag") %>%
  unnest() %>%
  dplyr::select(station_name, Absolute_Error, week, geometry) %>%
  gather(variable, value, -station_name, -week, -geometry) %>%
  group_by(variable, station_name, week) %>%
  summarize(MAE = mean(value)) %>%
  left_join(., stations) %>%
  arrange(MAE) %>%
  st_sf()
    
# map station-level MAE, one facet per test week
ggplot() +
  geom_sf(data = dc_nhoods, fill = "gray98", color = "gray80", size = 0.2) +
  geom_sf(data = errors_by_week, aes(color = MAE), size = 2) +
  scale_color_viridis_c("MAE", direction = -1, option = "F") +
  facet_wrap(~week) +
  labs(title = "Mean absolute error by station by week",
       subtitle = "Oct 25 to Nov 7, 2021") +
  mapTheme()

Cross-validation

As another measure of the model’s quality, I performed k-fold cross-validation across 20 randomly selected folds. (I wanted to do 100, but it was taking hours to run and I needed to submit this assignment.) The histogram and table below show the distribution of errors across the folds.

# --- perform k-fold cross-validation ---

# set parameters: 20 random folds, seeded for reproducibility
control <- trainControl(method = "cv", number = 20)
set.seed(337)

# train a linear model on the full panel with all features; rows with NA
# (e.g. the lag warm-up hours) are dropped by na.omit
reg_cv <-
  train(
    trips ~ .,
    data = trip_panel,
    method = "lm",
    trControl = control,
    na.action = na.omit
  )

# retrieve MAE for all folds by column name rather than positional index
# (fixed: resample[, 3] silently breaks if caret reorders metric columns)
cv_mae <- data.frame(MAE = reg_cv$resample[, "MAE"])

# plot the distribution of per-fold MAE, with a dashed line at the mean
ggplot() +
  geom_histogram(data = cv_mae, 
                 aes(x = MAE), 
                 fill = "#6baed6",
                 color = "#ffffff",
                 binwidth = 0.002) +
  geom_vline(xintercept = mean(cv_mae$MAE), color = "gray10", linetype = 2) +
  scale_x_continuous(breaks = round(seq(0, max(cv_mae$MAE), by = 0.005), 4)) +
  labs(title = "Distribution of mean absolute error",
       subtitle = "k-fold cross-validation; k = 20; dashed line at mean",
       x = "MAE",
       y = "Folds") +
  plotTheme()

# calculate mean and standard deviation of MAE across the 20 folds
validation_table <- data.frame(Mean = round(mean(cv_mae$MAE), 3),
                              StdDev = round(sd(cv_mae$MAE), 3))

# render as a styled kableExtra table
validation_table %>%
  kbl(caption = "MAE across 20 folds", digits = 3) %>%
  kable_classic(full_width = FALSE)
MAE across 20 folds
Mean StdDev
0.899 0.01

Conclusion

How useful this model is probably depends on how good Capital Bikeshare’s data scientists are. Truthfully, I had some difficulty figuring out how to assess it: even the best of the four regressions had a fairly low R-squared of about 0.48; the lagged trips barely correlated with trip numbers at all; the best of the four models tested had a lower MAE than the worst one, but not by an astonishing amount. Despite that, the plot of predicted and observed trips for the model with space-time and lagged features looks compelling – so what to make of it? I honestly don’t know. As always, I’d be very interested to know how different bikeshare systems model this currently.


Citation for missing value imputation: Moritz, Steffen, and Bartz-Beielstein, Thomas. “imputeTS: Time Series Missing Value Imputation in R.” R Journal 9.1 (2017). doi: 10.32614/RJ-2017-009.